{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "from config import *\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "q_std_map = pd.read_csv('F:/data/corpus/qa/FinanceFAQ/input/q_std.tsv', sep='\\t', encoding='utf-8')['0'].to_dict()\n",
    "query_pair = pd.read_csv('F:/data/corpus/qa/FinanceFAQ/input/query_pair_0.tsv', sep='\\t', encoding='utf-8')\n",
    "query_pair['q_std'] = query_pair['q_std'].map(q_std_map)\n",
    "query_pair.to_csv(fst_train_file, sep='\\t', encoding='utf-8', index=False)\n",
    "query_pair.iloc[5:9]"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>q_std</th>\n",
       "      <th>q_sim</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>现金利能否直接购买股票</td>\n",
       "      <td>就是说现金利是可以卖股票的对吗</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>损益表的介绍</td>\n",
       "      <td>损益表是啥</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>#股票名称#季度报告</td>\n",
       "      <td>详细说下600338第一季报吧</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>未成交的介绍</td>\n",
       "      <td>需要知道未成交是什么</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         q_std            q_sim\n",
       "5  现金利能否直接购买股票  就是说现金利是可以卖股票的对吗\n",
       "6       损益表的介绍            损益表是啥\n",
       "7   #股票名称#季度报告  详细说下600338第一季报吧\n",
       "8       未成交的介绍       需要知道未成交是什么"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "source": [
    "query_pair.q_sim.str.len().describe()"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "count    50000.000000\n",
       "mean        18.549940\n",
       "std          7.961594\n",
       "min          1.000000\n",
       "25%         13.000000\n",
       "50%         18.000000\n",
       "75%         23.000000\n",
       "max        108.000000\n",
       "Name: q_sim, dtype: float64"
      ]
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "source": [
    "# 为每个q_std随机选择一个q_sim作为dev集\n",
    "q_std_list = query_pair['q_std'].unique().tolist()\n",
    "query_pair['test_rnd'] = query_pair.q_std.apply(lambda x: np.random.rand())\n",
    "query_pair['nrank_test'] = query_pair.groupby('q_std')['test_rnd'].rank(ascending=0, method='first')\n",
    "dev_query_pair =  query_pair[query_pair.nrank_test<=1][['q_std', 'q_sim']]\n",
    "dev_query_pair.head(5)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>q_std</th>\n",
       "      <th>q_sim</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>回档的介绍</td>\n",
       "      <td>回档是什么东西</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>合约未了结情况下，卖出担保品时，提示零股不支持交易的解决方式</td>\n",
       "      <td>合约未了结情况下，卖出担保品时，解决提醒零股不能买卖的方式可能是什么</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>任职基金数大于#数字实体#的基金经理有哪些</td>\n",
       "      <td>想知道任职基金数大于50的基金经理有哪些</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>市销率大于#数字实体#的行业有哪些</td>\n",
       "      <td>我来咨询看看市销率大于100行业都有啥</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>竞价涨幅不小于#数字实体#的#地域板块#股票有哪些</td>\n",
       "      <td>给我发下都有啥内蒙板块股票是竞价涨幅不少于50</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             q_std                               q_sim\n",
       "1                            回档的介绍                             回档是什么东西\n",
       "11  合约未了结情况下，卖出担保品时，提示零股不支持交易的解决方式  合约未了结情况下，卖出担保品时，解决提醒零股不能买卖的方式可能是什么\n",
       "15           任职基金数大于#数字实体#的基金经理有哪些                想知道任职基金数大于50的基金经理有哪些\n",
       "16               市销率大于#数字实体#的行业有哪些                 我来咨询看看市销率大于100行业都有啥\n",
       "31       竞价涨幅不小于#数字实体#的#地域板块#股票有哪些             给我发下都有啥内蒙板块股票是竞价涨幅不少于50"
      ]
     },
     "metadata": {},
     "execution_count": 11
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "source": [
    "# 为所有的query配一个qid\n",
    "q_std_dev = dev_query_pair.q_std.unique().tolist()\n",
    "q_sim_dev = dev_query_pair.q_sim.unique().tolist()\n",
    "q_qid = q_std_dev + q_sim_dev\n",
    "q_qid = list(set(q_qid))\n",
    "q_qid_dict = {i+1:q_qid[i] for i in range(0, len(q_qid))}  # {id: query}\n",
    "q_qid_dict_inv = {v: k for k, v in q_qid_dict.items()}  # {query: id}\n",
    "\n",
    "# 建立ir_corpus： [q_id, q_std]的映射\n",
    "ir_corpus = {q_qid_dict_inv[v]: v for v in q_std_list if v not in q_sim_dev}\n",
    "ir_corpus_df = pd.DataFrame(list(ir_corpus.items()), columns=['qid', 'question']).sort_values('qid').reset_index(drop=True)\n",
    "ir_corpus_df.to_csv(ir_path, sep='\\t', index=False)\n",
    "\n",
    "# 保存dev\n",
    "dev_query_pair['qid'] = dev_query_pair.q_sim.map(q_qid_dict_inv)\n",
    "dev_query_pair['duplicate_qids'] = dev_query_pair.q_std.map(q_qid_dict_inv)\n",
    "dev_query_pair.duplicate_qids = dev_query_pair.duplicate_qids.astype('str')\n",
    "dev_query_pair = dev_query_pair.groupby(['q_sim', 'qid']).apply(lambda v: ','.join(v['duplicate_qids'])).reset_index(name='duplicate_qids')[['qid', 'q_sim', 'duplicate_qids']]\n",
    "dev_query_pair.to_csv(fst_dev_file, sep='\\t', index=False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "print('读取数据集并分别保存标问、相似问、所有语料: ', fst_train_file)\n",
    "std_data = pd.read_csv(fst_train_file, sep=\"\\t\")\n",
    "q_std_list = std_data.q_std.unique().tolist()  # 标准问list\n",
    "q_sim_list = std_data.q_sim.unique().tolist()  # 相似问list\n",
    "q_corpus = list(set(q_std_list + q_sim_list))\n",
    "\n",
    "q_std_df = pd.DataFrame(q_std_list, columns=['q'])\n",
    "q_corpus_df = pd.DataFrame(q_corpus, columns=['q'])\n",
    "q_sim_df = pd.DataFrame(q_sim_list, columns=['q'])\n",
    "\n",
    "q_std_df.to_csv(q_std_file, index=None, header=False, sep=\"\\t\")\n",
    "q_corpus_df.to_csv(q_corpus_file, index=None, header=False, sep=\"\\t\")\n",
    "q_sim_df.to_csv(q_sim_file, index=None, header=False, sep=\"\\t\")\n",
    "\n",
    "print('q_std_list:——>', len(q_std_list), 'q_sim_list:——>', len(q_sim_list), 'q_corpus:——>', len(q_corpus))"
   ],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.8 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}